/*******************************************************************************
Last updated: February 22 2023
********************************************************************************/

cls
clear all
local dir "~\Dropbox\Working Papers\Distinction Effect"
cd "`dir'"

qui do "Does/Data Processing/0. Cleaning Programs.do"

/*******************************************************************************
			1. Universe : College Graduates between 2001 - 2015-I
********************************************************************************/

import delimited "Data\Originals\OLE\laboral_egresados_2001_2015.csv", encoding("utf-8")  stringc(_all) clear

rename idprograma ole_programcode
rename idmodalidad ole_programlevel
rename idarea ole_programarea
rename idnucleo ole_programfield
rename primernombre ole_name1
rename segundonombre ole_name2
rename primerapellido ole_surname1
rename segundoapellido ole_surname2
rename fechanacimiento birthdate
rename genero gender
rename idtipodocumento national_id_type
rename numdocumento national_id
rename fechagrado ole_graddate

gen ole_gradtime = anio_grado+semestre_grado

tempfile Universe
preserve 
	keep national_id_type national_id ole_name1 ole_name2 ole_surname1 ole_surname2 birthdate gender
	duplicates drop national_id_type national_id, force
	save `Universe', replace
restore

//Information on Undergraduae Programs (2/3-years and 4/5-years)
tempfile UndergradPrograms
preserve
	destring ole_programlevel, replace
	keep if ole_programlevel == 4 | ole_programlevel == 5 | ole_programlevel == 6
	keep national_id_type national_id ole_gradtime ole_graddate ole_programcode ole_programlevel ole_programarea ole_programfield 
	
	destring ole_gradtime, replace	
	gen aux = ole_gradtime if ole_programlevel == 4 | ole_programlevel == 5
	bys national_id_type national_id : egen fst_grad2y = min(aux)
	drop aux
	gen aux = ole_gradtime if ole_programlevel == 6
	bys national_id_type national_id : egen fst_grad4y = min(aux)
	drop aux	
	egen fst_grad = rowmin(fst_grad2y fst_grad4y)
	
	rename ole_graddate aux
	gen ole_graddate = date(aux, "DMY")
	drop aux
	gen aux = ole_graddate if ole_programlevel == 4 | ole_programlevel == 5
	bys national_id_type national_id : egen fst_graddate2y = min(aux)
	format fst_graddate2y %td
	drop aux
	gen aux = ole_graddate if ole_programlevel == 6
	bys national_id_type national_id : egen fst_graddate4y = min(aux)
	format fst_graddate4y %td
	drop aux	
	egen fst_graddate = rowmin(fst_graddate2y fst_graddate4y)
	format fst_graddate %td
	drop ole_graddate
		
	gsort national_id_type national_id -ole_programlevel ole_gradtime	
	tostring ole_programlevel ole_gradtime, replace
	
	by national_id_type national_id : egen t = seq()
	reshape wide ole_gradtime ole_programcode ole_programlevel ole_programarea ole_programfield, i(national_id_type national_id) j(t)
	
	foreach var in ole_gradtime ole_programcode ole_programlevel ole_programarea ole_programfield {
		foreach t of numlist 4(1)12 {
			replace `var'3 = `var'3 + "," + `var'`t' if `var'3 != "" & `var'`t' != ""
		}
	}
	drop ole_programcode4-ole_gradtime12
	
	gsort -ole_gradtime3 -ole_gradtime2
	
	order national_id_type national_id fst_grad*
	
	save `UndergradPrograms', replace
restore

/*******************************************************************************
					2. Graduate Education Information
********************************************************************************/

//Information Graduate Programs
tempfile GradPrograms
preserve
	destring ole_programlevel, replace
	keep if ole_programlevel == 1 | ole_programlevel == 2 | ole_programlevel == 3
	keep national_id_type national_id ole_gradtime ole_programcode ole_programlevel ole_programarea ole_programfield
	
	destring ole_gradtime, replace	
	bys national_id_type national_id : egen fst_g_gradsch = min(ole_gradtime)
	
	rename ole_gradtime ole_g_gradtime
	rename ole_programcode ole_g_programcode 
	rename ole_programlevel ole_g_programlevel
	rename ole_programarea ole_g_programarea
	rename ole_programfield ole_g_programfield
	
	gsort national_id_type national_id ole_g_gradtime -ole_g_programlevel
	tostring ole_g_programlevel ole_g_gradtime, replace

	by national_id_type national_id : egen t = seq()
	reshape wide ole_g_gradtime ole_g_programcode ole_g_programlevel ole_g_programarea ole_g_programfield, i(national_id_type national_id) j(t)
	
	foreach var in ole_g_gradtime ole_g_programcode ole_g_programlevel ole_g_programarea ole_g_programfield {
		foreach t of numlist 4(1)8 {
			replace `var'3 = `var'3 + "," + `var'`t' if `var'3 != "" & `var'`t' != ""
		}
	}
	drop ole_g_programcode4-ole_g_gradtime8
	
	gsort -ole_g_gradtime3 -ole_g_gradtime2
	
	order national_id_type national_id fst_g_gradsch
	
	save `GradPrograms', replace
restore

use `Universe', clear 
merge 1:1 national_id_type national_id using `UndergradPrograms'
drop _merge
merge 1:1 national_id_type national_id using `GradPrograms'
drop _merge

save `Universe', replace

/*******************************************************************************
						3. Labor Market Information
********************************************************************************/

//Labor Market Information
foreach year of numlist 2007(1)2016 {
    *local year 2014
	import delimited "Data\Originals\OLE\laboral_egresados_2001_`year'.csv", encoding("utf-8")  stringc(_all) clear
		
	rename idtipodocumento national_id_type
	rename numdocumento national_id
	rename idtipocotizante ole_workertype
	rename idclase ole_ciiu
	rename idmunicipio_cotiz ole_municipality
	rename ingresobc ole_earnings
	rename cotizante ole_formalworker

	global LaborVars ole_workertype ole_ciiu ole_municipality ole_earnings ole_formalworker
	destring $LaborVars, replace

	foreach var in $LaborVars {
	rename `var' aux
	bys national_id_type national_id : egen `var'_`year' = max(aux) //No diff using min()
	drop aux
	}
	keep national_id_type national_id $LaborVars
	duplicates drop national_id_type national_id, force

	tempfile LaborMarket`year'	
	save `LaborMarket`year'', replace
	
	preserve 
		use `Universe', replace
		merge 1:1 national_id national_id_type using `LaborMarket`year''
		keep if _merge == 3 | _merge == 1
		drop _merge
		save `Universe', replace
	restore	
}

use `Universe', clear

//Cleaning 

//Birthdate
rename birthdate aux 
gen birthdate = date(aux, "DMY")
format birthdate %td
drop aux

preserve
	keep if national_id_type == "CC"
	rename national_id id
	gen national_id = id 
	destring national_id, replace force
	drop if national_id == .
	duplicates drop national_id, force
	joinby national_id using "Data/Originals/Registraduria/Registraduria_CC.dta"
	
	gen reg_name = reg_nombre1 + " " + reg_nombre2 + " " + reg_apellido1 + " " + reg_apellido2
	keep national_id_type id reg_birth_date reg_name
	spell reg_name, clean(reg_name) up(reg_name)
	drop if reg_birth_date == .
	rename id national_id
	
	tempfile Birthdates
	save `Birthdates', replace
restore

merge 1:1 national* using `Birthdates'
keep if _merge == 3 | _merge == 1
drop _merge

gen ole_name = ole_name1 + " " + ole_name2 + " " + ole_surname1 + " " + ole_surname2
spell ole_name, clean(ole_name) up(ole_name)

matchit ole_name reg_name, sim(ngram,2) generate(phone_score)
replace reg_birth_date = . if phone_score < 0.8
replace birthdate = reg_birth_date if birthdate == .
drop reg_name reg_birth_date


preserve
	keep if national_id_type == "CC"
	keep if birthdate == .
	gen numero = national_id
	destring numero, replace force
	drop if numero == .
	gen sex = (gender == "M") if gender != ""
	drop if sex == .
	duplicates drop numero, force
	
	keep national* numero sex
	joinby numero sex using "Data/Originals/PILA/Pila 2010.dta"
	keep if tipo == 1
	drop if birth == .
	drop if numero <= 5000000
	
	keep national_id* birth
	tempfile BirthdatesPila
	save `BirthdatesPila'
restore

merge 1:1 national* using `BirthdatesPila'
keep if _merge == 3 | _merge == 1
drop _merge

replace birthdate = birth if birthdate == .
drop birth

preserve
	keep if birthdate == .	
	gen doc_num = national_id
	gen doc_tipo = national_id_type
	replace doc_tipo = "C" if doc_tipo == "CC"
	replace doc_tipo = "T" if doc_tipo == "TI"
	keep if doc_tipo == "T" | doc_tipo == "C"
		
	destring doc_num, replace force
	drop if doc_num == .
	gen sexo = (gender == "F") if gender != ""
	drop if sexo == .
	duplicates drop doc_tipo doc_num, force	
	
	keep national* doc_tipo doc_num sexo ole_name
	joinby doc_tipo doc_num sexo using "Data\Originals\SPADIES\Spadies 2016.dta"
	gen spa_name = nombre + " " + apellido
	spell spa_name, clean(spa_name) up(spa_name)
	duplicates drop doc_tipo doc_num spa_name, force
	
	matchit ole_name spa_name, sim(ngram,2) generate(phone_score)
	drop if phone_score < 0.8	
	
	tostring nac_dia nac_mes nac_ano, replace force
	gen spa_birthdate = date(nac_dia +"/"+ nac_mes +"/"+ nac_ano, "DMY")
	format spa_birthdate %td
	drop if spa_birthdate == .
	
	duplicates tag national*, gen(dup)
	drop if dup > 0
	
	keep national* spa_birthdate
	tempfile BirthdatesSpadies
	save `BirthdatesSpadies'	
restore

merge 1:1 national* using `BirthdatesSpadies'
keep if _merge == 3 | _merge == 1
drop _merge

replace birthdate = spa_birthdate if birthdate == .
drop spa_birthdate
drop ole_name

rename birthdate ole_birthdate
order ole_birthdate, before(gender)

save "Data\Intermediates\OLE_2007_2016.dta", replace


